One could apply the following methods to the data.
Install package checkpoint prior to running to following code in your shell: - install.packages(checkpoint) - checkpoint(“2019-09-20”)
# library for TSNE
library(Rtsne)
library(corrplot)
library(gplots)
library(gplots) # for making own color code (e.g. for heatmap representation)
library(kohonen)
library(plot3D)
library(factoextra) # more complexe library than standard R library; all methods for distance & linkage available
library(stringr)
library(dplyr)
library(dendextend)
library(ape)
library(RColorBrewer) # for making own color code (e.g. for heatmap representation)
# RColorBrewer is used to show different shades of colours
library(RColorBrewer)
library(rmarkdown)
library(DataExplorer)
library(tidyverse) # SOM
library(kohonen) # SOM
library(checkpoint)
One can you use this decision support tool to get the suitable methods for the predominant problem:
https://mod.rapidminer.com/#app
# clean the memory
rm(list = ls(all = TRUE))
# load data
df <- read.csv("winequality-red.csv")
# count the missing values (NA) in the data frame
sum(is.na(df))
## [1] 0
# check if the columns are numeric
sapply(df, is.numeric)
## fixed.acidity volatile.acidity citric.acid
## TRUE TRUE TRUE
## residual.sugar chlorides free.sulfur.dioxide
## TRUE TRUE TRUE
## total.sulfur.dioxide density pH
## TRUE TRUE TRUE
## sulphates alcohol quality
## TRUE TRUE TRUE
# scale the data
df_scaled <- scale(df)
class(df_scaled) # is now a matrix
## [1] "matrix"
as_tibble(df_scaled)
glimpse(df)
## Observations: 1,599
## Variables: 12
## $ fixed.acidity <dbl> 7.4, 7.8, 7.8, 11.2, 7.4, 7.4, 7.9, 7.3, ...
## $ volatile.acidity <dbl> 0.700, 0.880, 0.760, 0.280, 0.700, 0.660,...
## $ citric.acid <dbl> 0.00, 0.00, 0.04, 0.56, 0.00, 0.00, 0.06,...
## $ residual.sugar <dbl> 1.9, 2.6, 2.3, 1.9, 1.9, 1.8, 1.6, 1.2, 2...
## $ chlorides <dbl> 0.076, 0.098, 0.092, 0.075, 0.076, 0.075,...
## $ free.sulfur.dioxide <dbl> 11, 25, 15, 17, 11, 13, 15, 15, 9, 17, 15...
## $ total.sulfur.dioxide <dbl> 34, 67, 54, 60, 34, 40, 59, 21, 18, 102, ...
## $ density <dbl> 0.9978, 0.9968, 0.9970, 0.9980, 0.9978, 0...
## $ pH <dbl> 3.51, 3.20, 3.26, 3.16, 3.51, 3.51, 3.30,...
## $ sulphates <dbl> 0.56, 0.68, 0.65, 0.58, 0.56, 0.56, 0.46,...
## $ alcohol <dbl> 9.4, 9.8, 9.8, 9.8, 9.4, 9.4, 9.4, 10.0, ...
## $ quality <int> 5, 5, 5, 6, 5, 5, 5, 7, 7, 5, 5, 5, 5, 5,...
# View(df)
summary(df)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :0.01200 Min. : 1.00 Min. : 6.00
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00
## Median :0.07900 Median :14.00 Median : 38.00
## Mean :0.08747 Mean :15.87 Mean : 46.47
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00
## Max. :0.61100 Max. :72.00 Max. :289.00
## density pH sulphates alcohol
## Min. :0.9901 Min. :2.740 Min. :0.3300 Min. : 8.40
## 1st Qu.:0.9956 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50
## Median :0.9968 Median :3.310 Median :0.6200 Median :10.20
## Mean :0.9967 Mean :3.311 Mean :0.6581 Mean :10.42
## 3rd Qu.:0.9978 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10
## Max. :1.0037 Max. :4.010 Max. :2.0000 Max. :14.90
## quality
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.636
## 3rd Qu.:6.000
## Max. :8.000
# create_report(df)
Available metrics in the returend k-Means object
# Run K-means for k=2
k <- 2 #defining two clusters
km.out <- kmeans(df_scaled, k,nstart =50)
plot(df_scaled, col =(km.out$cluster+1), main ="K-Means Clustering", xlab ="", ylab ="", pch =20, cex =2)
# Check the outcome
km.out
## K-means clustering with 2 clusters of sizes 585, 1014
##
## Cluster means:
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 0.8858867 -0.7295470 0.9891242 0.12584600 0.2323040
## 2 -0.5110885 0.4208925 -0.5706485 -0.07260346 -0.1340215
## free.sulfur.dioxide total.sulfur.dioxide density pH
## 1 -0.1781002 -0.2488421 0.3902513 -0.6740906
## 2 0.1027501 0.1435627 -0.2251450 0.3888984
## sulphates alcohol quality
## 1 0.5693024 0.2903706 0.4676411
## 2 -0.3284437 -0.1675215 -0.2697930
##
## Clustering vector:
## [1] 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 1 1 2 1 1 2 1 2 2 2 2 1 2 2 2 2 2 2
## [35] 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2
## [69] 1 2 2 2 2 2 1 1 1 2 2 2 2 1 2 1 2 2 1 2 1 2 2 1 1 2 2 2 2 2 2 2 2 2
## [103] 2 2 2 2 1 2 1 2 2 2 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [137] 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1
## [171] 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 1 2 2
## [205] 2 1 1 2 2 1 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2
## [239] 2 2 1 1 2 1 1 2 2 2 2 2 1 2 1 2 2 2 1 2 1 1 2 2 2 2 1 1 2 1 2 1 2 1
## [273] 1 2 2 2 2 1 1 1 1 1 2 1 2 2 1 2 1 1 1 1 1 2 1 1 2 2 2 2 2 1 2 2 2 1
## [307] 2 1 1 2 1 2 2 2 2 2 2 2 1 2 1 2 2 1 1 1 1 1 1 1 1 1 2 2 2 1 1 2 1 1
## [341] 1 1 1 1 1 2 2 1 1 2 1 2 2 1 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 2 1 1 2
## [375] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 1 2 1 1 2 1 1 2 1 1 2 2 2 1 1 2 1 1 1
## [409] 1 1 2 2 2 1 2 2 1 2 1 2 1 2 2 1 2 2 2 2 2 1 1 2 1 1 1 1 2 1 1 2 1 1
## [443] 1 1 2 2 1 1 2 1 1 1 2 1 2 1 2 2 1 1 1 2 1 2 1 1 1 1 1 2 1 1 1 1 1 2
## [477] 1 1 2 2 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2 1 2 2 1 2 2 1 1 1 1 1 1 1 1 1
## [511] 1 1 1 1 1 1 1 1 1 2 1 2 1 2 1 2 2 1 1 2 1 1 1 1 1 1 2 2 1 1 2 1 2 1
## [545] 1 1 2 1 1 1 2 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 2 2 1 2 1 2 1 1 1 1 2 2
## [579] 2 1 1 1 1 1 1 2 1 2 2 1 1 2 1 1 2 2 1 1 2 1 2 1 2 1 2 2 1 1 1 2 1 1
## [613] 2 1 1 2 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 1 2 1 2 1 2
## [647] 2 2 1 2 1 2 1 1 1 2 1 1 2 2 2 2 2 1 1 2 1 1 1 1 2 2 2 2 1 1 1 2 2 1
## [681] 1 2 2 2 2 2 2 2 2 1 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2
## [715] 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2
## [749] 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2
## [783] 2 2 2 1 1 1 1 2 2 2 2 2 1 1 1 1 1 1 2 2 2 2 2 1 1 1 2 2 2 1 1 2 1 1
## [817] 1 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 1 1 2 2 2 2 1 2 1 2 1 2 1 2 2 2 2 2
## [851] 1 1 1 1 1 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 2 2 2 2 2 1 2
## [885] 2 2 2 1 2 1 2 2 1 2 2 2 1 2 1 2 1 2 2 2 2 2 2 2 2 2 1 1 1 1 2 1 2 2
## [919] 1 2 1 1 2 2 1 1 1 2 1 1 2 2 2 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1
## [953] 1 1 2 1 1 1 2 2 1 2 2 1 1 1 1 2 1 2 1 1 1 2 1 2 2 2 2 1 1 2 2 1 1 2
## [987] 1 2 2 1 2 2 2 2 1 2 2 2 2 2 2 1 1 2 2 2 1 1 1 1 1 1 2 2 2 1 1 1 1 2
## [1021] 1 1 2 1 2 2 2 2 2 2 2 2 2 2 2 1 1 2 1 1 2 2 1 1 2 2 2 2 1 1 2 1 2 1
## [1055] 2 2 1 2 1 1 1 1 1 1 2 2 2 1 1 2 1 2 2 2 2 1 1 1 1 1 1 1 2 1 2 2 1 1
## [1089] 1 1 1 1 2 1 2 1 2 2 1 2 1 2 2 2 2 2 1 1 2 1 2 2 1 1 2 2 2 2 2 2 1 2
## [1123] 2 1 2 1 2 2 1 1 2 2 1 2 1 1 1 1 2 2 2 1 2 2 2 1 2 1 1 1 1 2 2 1 2 2
## [1157] 1 2 1 1 1 1 1 2 2 1 1 1 2 2 1 2 1 2 2 2 2 2 2 1 1 1 1 2 2 2 2 2 2 2
## [1191] 1 2 1 2 2 2 2 2 1 2 2 1 1 2 1 1 1 1 1 1 2 2 2 1 1 1 2 1 1 1 1 1 2 1
## [1225] 1 2 2 2 2 2 1 2 2 1 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [1259] 2 2 1 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 1 2 2 2 2 2 1 1 2 2 2 2 2
## [1293] 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 2 1 1 2 2
## [1327] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1
## [1361] 2 2 1 2 2 2 2 1 2 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [1395] 2 2 2 2 2 2 2 2 1 1 2 1 1 2 1 2 2 2 1 1 1 2 1 1 2 2 2 2 2 2 1 1 1 2
## [1429] 2 1 2 2 2 2 1 1 1 2 2 2 1 2 2 2 2 2 2 2 2 1 1 1 2 2 1 2 2 2 1 1 2 2
## [1463] 2 2 2 2 2 2 2 2 2 2 1 2 1 2 1 2 2 1 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2
## [1497] 2 2 2 2 2 2 2 2 1 2 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2
## [1531] 2 2 2 2 2 2 2 2 2 2 2 1 2 1 1 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [1565] 2 2 1 2 2 2 1 2 2 2 1 2 1 2 2 2 2 2 2 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2
## [1599] 2
##
## Within cluster sum of squares by cluster:
## [1] 7434.904 8334.629
## (between_SS / total_SS = 17.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
km.out$withinss
## [1] 7434.904 8334.629
cat("withinss =", km.out$tot.withinss)
## withinss = 15769.53
# Find best value for k
wss <- 0 # intialise
wss_dif <- 0
number_of_clusters_tested <- 20
for (i in 1:number_of_clusters_tested){
km.out <- kmeans(df_scaled,i,nstart =50)
wss[i] <- km.out$tot.withinss
if(i > 1){ # only enter condition for two clusters and higher
wss_dif[i-1] <- wss[i-1]-wss[i] # take difference from previous "total within-cluster sum of squares" and current one.
}
}
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
plot(1:number_of_clusters_tested, wss, type="b", xlab="Number of Clusters",ylab="Total within-cluster sum of squares")
plot(2:number_of_clusters_tested, wss_dif, type="b", xlab="Number of Clusters",ylab="Difference between Total within-cluster sum of squares")
Interpretation:
*First plot: K-means Clustering with k=2 is not very useful. This plot is only useful if you have a dataset with only two dimensions.
*Second plot: With increasing number of clusters the total within cluster sum of squares decreases always. To decide which k is the best, look for the ellbow in the plot. In this plot there is no clear ellbow visible. Therefore we go for k = 2.
<<<<<<< HEAD *Third plot: This plot shows the differnce between the i-th and the i-1th value of the total within-cluster sum of squares. This plot should be used with caution. ======= Third plot: This plot shows the differnce between the i-th and the i-1th value of the total within-cluster sum of squares. This plot should be viewed with caution as it compares the sum of squares to the previous (k-1) sum of squares. An alternative would be to subtract the mean from every sum of squares. >>>>>>> c216d2ff096e9a59e112e6360fa65339d581f011
Available metrics in the returend HC object
# Calculate distances between data (default method = euclidean):
distances <- dist(df_scaled, method = "euclidean")
# Compute hierarchical clustering based in distances calculated above:
hc <- hclust(distances)
# Computes dendrogram graphical representation:
dend <- as.dendrogram(hc)
# Graphical representation
plot(dend, main = "Dendogram plot")
# Alternative, standard output representation (can be useful for ctrl+find specific things in big trees)
# str(dend) # computionally very expensive, takes about 2 minutes
# Transpose data:
df_transposed <- t(df_scaled)
# Calculate distances between data (default method = euclidean):
distances_t <- dist(df_transposed, method = "euclidean")
# Compute hierarchical clustering based in distances calculated above:
hc_t <- hclust(distances_t)
# Computes dendrogram graphical representation:
dend_t <- as.dendrogram(hc_t)
# Graphical representation
plot(dend_t, main = "Transposed Dendogram plot")
# Alternative, standard output representation (can be useful for ctrl+find specific things in big trees)
str(dend_t)
## --[dendrogram w/ 2 branches and 12 members at h = 73.3]
## |--[dendrogram w/ 2 branches and 5 members at h = 53.8]
## | |--[dendrogram w/ 2 branches and 2 members at h = 44.8]
## | | |--leaf "chlorides"
## | | `--leaf "sulphates"
## | `--[dendrogram w/ 2 branches and 3 members at h = 45.1]
## | |--leaf "density"
## | `--[dendrogram w/ 2 branches and 2 members at h = 32.4]
## | |--leaf "fixed.acidity"
## | `--leaf "citric.acid"
## `--[dendrogram w/ 2 branches and 7 members at h = 66.7]
## |--[dendrogram w/ 2 branches and 2 members at h = 40.9]
## | |--leaf "alcohol"
## | `--leaf "quality"
## `--[dendrogram w/ 2 branches and 5 members at h = 58.9]
## |--[dendrogram w/ 2 branches and 2 members at h = 49.4]
## | |--leaf "volatile.acidity"
## | `--leaf "pH"
## `--[dendrogram w/ 2 branches and 3 members at h = 51]
## |--leaf "residual.sugar"
## `--[dendrogram w/ 2 branches and 2 members at h = 32.6]
## |--leaf "free.sulfur.dioxide"
## `--leaf "total.sulfur.dioxide"
# Use a specific linkage or distance methode in hc clustering
# Find more information about distances and which one to choose: https://www.datanovia.com/en/lessons/clustering-distance-measures/
# Possible distance methods: "euclidean", "maximum", "manhattan","canberra","binary", "minkowski"
# Find more information about linkage method and which one to choose: https://www.datanovia.com/en/lessons/agglomerative-hierarchical-clustering/#linkage
# Possible linkage methods: "ward.D", "ward.D2", "single", "complete","average", "mcquitty", "median","centroid"
# Compute hierarchical clustering with euclidean distance and complete method:
hc <- hclust(dist(df_scaled, method = "euclidean"),method="complete")
plot(hc, main = "Dendogram Plot with Euclidean distance, method complete")
# Missing correlation-based methods (spearman, kendall): using library(factoextra) for these methods
# Compute the dissimilarity matrix with different distance types: "euclidean", "manhattan", "pearson", "spearman", "kendall"
res.dist <- get_dist(df_scaled, method = "kendall")
# Visualize the dissimilarity matrix
fviz_dist(res.dist, lab_size = 8)
# Compute hierarchical clustering with different linkage types:"single", "complete", "average", "centroid", "ward.D", "ward.D2"
res.hc <- hclust(res.dist, method = "ward")
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
# Visualize the tree
fviz_dend(res.hc)
# Or simply
plot(res.hc, main = "HC with ward linkage type")
# Create heatmap with scaled data
heatmap(df_scaled)
# Make your own color code brewer.pal(n, name)
# n Number of different colors in the palette, minimum 3, maximum depending on palette
# name Blues BuGn BuPu GnBu Greens Greys Oranges OrRd PuBu PuBuGn PuRd Purples RdPu Reds YlGn YlGnBu YlOrBr YlOrRd
palette <- colorRampPalette(brewer.pal(9, "YlGnBu"))
# Row- and column-wise clustering, with wished linkage and distance method; hc1 with regular data and hc2 with transposed data:
hc1 <- hclust(as.dist(1-cor(t(df_scaled), method="kendall")), method="ward.D2")
hc2 <- hclust(as.dist(1-cor(df_scaled, method="spearman")), method="single")
# Plot heatmap: Heatmap.2 (included in the Gplots Library) allows to define linkage & distance methods for heatmap representaton
heatmap.2(df_scaled, Rowv=as.dendrogram(hc1), Colv=as.dendrogram(hc2), col=palette,scale="row", density.info="none", trace="none")
Interpretation:
*Dendogram plot: This plot is not very useful. It contains a lot of overloaded information. The height of the cut in the dendogram has comparable role as the number of K in K-means clustering: it controls the number of clusters obtained.
*Transposed Dendogram plot: This plot is way more useful and contains information to work with. The lower the altitude of a branch is, the closer the predictors are to each other. For example fixed.acidity and citric.acid have the lowest branch hight which means that these two predictors are closely related to each other.
*Dissimilarity Matrix: tbd
*Heatmaps: tbd
Available metrics in the returend TSNE object
https://www.analyticsvidhya.com/blog/2017/01/t-sne-implementation-r-python/
# define parameter
dims <- "tbd"
# ...
df_scaled_df <- as.data.frame(df_scaled)
# datapoint later in the plot: data_label<-as.factor(rownames(data))
# Run tSNE:
# we need to remove duplicates
tsne <- Rtsne(df_scaled_df[!duplicated(df_scaled_df), ], dims = 2,
perplexity=50, verbose=TRUE,
max_iter = 500)
## Performing PCA
## Read the 1359 x 12 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 50.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.40 seconds (sparsity = 0.154832)!
## Learning embedding...
## Iteration 50: error is 67.308576 (50 iterations in 0.26 seconds)
## Iteration 100: error is 67.288818 (50 iterations in 0.31 seconds)
## Iteration 150: error is 66.295658 (50 iterations in 0.22 seconds)
## Iteration 200: error is 66.293890 (50 iterations in 0.18 seconds)
## Iteration 250: error is 66.294082 (50 iterations in 0.18 seconds)
## Iteration 300: error is 1.397036 (50 iterations in 0.21 seconds)
## Iteration 350: error is 1.245770 (50 iterations in 0.18 seconds)
## Iteration 400: error is 1.194244 (50 iterations in 0.19 seconds)
## Iteration 450: error is 1.175803 (50 iterations in 0.19 seconds)
## Iteration 500: error is 1.170494 (50 iterations in 0.19 seconds)
## Fitting performed in 2.11 seconds.
## 1 ## This first section concerning colouring allows us to colour scaled numeric columns
#Create a function to generate a continuous color palette
rbPal <- colorRampPalette(c('red','yellow'))
#This adds a column of color values
# creates 10 colour buckets based on quality
df_scaled_df$Col_qual <- rbPal(8)[as.numeric(cut(df_scaled_df$quality,breaks = 10))]
# creates 10 colour buckets based on ph
df_scaled_df$Col_pH <- rbPal(7)[as.numeric(cut((df_scaled_df$pH),breaks = 10))]
## 2 ## We can choose colouring to be df_scaled_df$Col or e.g. df$quality
# Plot data and labels:
plot(tsne$Y, col=df_scaled_df$Col_qual, pch=16, main = "TSNE coloured by quality")
plot(tsne$Y, col=df_scaled_df$Col_pH, pch=16, main = "TSNE coloured by pH")
str(tsne)
## List of 14
## $ N : int 1359
## $ Y : num [1:1359, 1:2] 8.83 5.67 5.78 -10.77 8.81 ...
## $ costs : num [1:1359] 0.000511 0.000979 0.001495 0.000764 0.000481 ...
## $ itercosts : num [1:10] 67.3 67.3 66.3 66.3 66.3 ...
## $ origD : int 12
## $ perplexity : num 50
## $ theta : num 0.5
## $ max_iter : num 500
## $ stop_lying_iter : int 250
## $ mom_switch_iter : int 250
## $ momentum : num 0.5
## $ final_momentum : num 0.8
## $ eta : num 200
## $ exaggeration_factor: num 12
Interpretation:
Available metrics in the returend PCA object
# Compute PCA.
# scale=TRUE to scale the variables to have standard deviation = 1
pca_out = prcomp(df, scale=TRUE)
# Show available metrics computed by PCA:
names(pca_out)
## [1] "sdev" "rotation" "center" "scale" "x"
# Access metrics computed by PCA
pca_out$sdev # show sdev
## [1] 1.7666827 1.4972916 1.2972739 1.1022799 0.9865412 0.8139977 0.7863319
## [8] 0.7112472 0.6413326 0.5726425 0.4245216 0.2439629
pca_out$center # show mean
## fixed.acidity volatile.acidity citric.acid
## 8.31963727 0.52782051 0.27097561
## residual.sugar chlorides free.sulfur.dioxide
## 2.53880550 0.08746654 15.87492183
## total.sulfur.dioxide density pH
## 46.46779237 0.99674668 3.31111320
## sulphates alcohol quality
## 0.65814884 10.42298311 5.63602251
pca_out$scale # show scale
## fixed.acidity volatile.acidity citric.acid
## 1.741096318 0.179059704 0.194801137
## residual.sugar chlorides free.sulfur.dioxide
## 1.409928060 0.047065302 10.460156970
## total.sulfur.dioxide density pH
## 32.895324478 0.001887334 0.154386465
## sulphates alcohol quality
## 0.169506980 1.065667582 0.807569440
# Rotation matrix provides the principal component of the loadings.
dim(pca_out$rotation) # p*p matrix whereas p is the number of variables (loadings)
## [1] 12 12
pca_out$rotation
## PC1 PC2 PC3 PC4
## fixed.acidity 0.487883358 -0.004173212 0.16482854 0.231098077
## volatile.acidity -0.265128984 0.338967858 0.22708884 -0.041858245
## citric.acid 0.473335467 -0.137358104 -0.10022856 0.056735802
## residual.sugar 0.139154423 0.167736336 -0.24362014 0.383037581
## chlorides 0.197426792 0.189788185 0.02660785 -0.654777820
## free.sulfur.dioxide -0.045880713 0.259483136 -0.61611132 0.033711483
## total.sulfur.dioxide 0.004066746 0.363971374 -0.54073214 0.028459726
## density 0.370301191 0.330780789 0.16872267 0.200693412
## pH -0.432720849 -0.065440145 -0.06977056 0.005466181
## sulphates 0.254535354 -0.109333620 -0.21291324 -0.560502367
## alcohol -0.073176777 -0.502708647 -0.22497138 0.091701428
## quality 0.112488776 -0.473166214 -0.22336929 0.036669226
## PC5 PC6 PC7 PC8
## fixed.acidity -0.07877938 0.05553130 -0.3072150 0.20052866
## volatile.acidity 0.29937933 0.29728700 -0.6262337 0.14612614
## citric.acid -0.12014871 0.13663328 0.2441486 0.29633271
## residual.sugar 0.70936319 0.10931059 0.2838543 -0.17062614
## chlorides 0.26623723 0.33733656 0.2305470 -0.18692254
## free.sulfur.dioxide -0.15941286 -0.04264807 -0.1382604 -0.01935607
## total.sulfur.dioxide -0.21845284 0.11595360 -0.1102087 0.08989655
## density 0.20879298 -0.42566742 -0.1225465 0.07950023
## pH 0.25764682 -0.48035396 0.1856917 0.31469303
## sulphates 0.21483493 -0.40374303 -0.2334021 0.27549158
## alcohol 0.25972635 0.39217625 -0.1217188 0.47118865
## quality 0.13758414 -0.14183046 -0.4123879 -0.61224719
## PC9 PC10 PC11 PC12
## fixed.acidity -0.17457815 0.182956014 -0.256437921 0.638579761
## volatile.acidity -0.06022334 -0.155105626 0.377161229 0.004661681
## citric.acid -0.22097505 -0.346085556 0.624327833 -0.070036908
## residual.sugar 0.27818728 0.052236558 0.088077871 0.183646374
## chlorides -0.41993639 0.003862734 -0.208616670 0.053931176
## free.sulfur.dioxide -0.31800012 0.585388580 0.237933171 -0.051921666
## total.sulfur.dioxide 0.12182276 -0.589188239 -0.355046842 0.069792953
## density -0.24907449 -0.043538098 -0.231453058 -0.566644992
## pH -0.46191598 -0.207609889 -0.005599072 0.341230056
## sulphates 0.45268884 0.071918570 0.097637028 0.067792979
## alcohol -0.09652795 0.110605247 -0.319948696 -0.317640325
## quality -0.24024309 -0.260239788 0.052465707 0.008470476
# x matrix provides the principal component of the scores.
dim(pca_out$x) # n * p matrix whereas n is the number of observations and p is the number of principal componentes
## [1] 1599 12
# pca_out$x # not executed due to size of the matrix
Interpretation:
# Create Biplot
# scale=0 ensures that the arrows are scaled to represent the loadings;
# other values for scale give slightly different biplots with different interpretations.
biplot(pca_out,scale=0)
# Create Biplot with reduced sample to get a better overview
# Sample Data
set.seed(2)
sample_1=sample(1:nrow(df), nrow(df)/3) # sample a third of the data for training data
df_reduced=df[sample_1,]
# Calculate PCA on reduced dataset
pca_out_reduced = prcomp(df_reduced, scale=TRUE)
# Create Biplot on reduced dataset
biplot(pca_out_reduced,scale=0)
# create_report(df)
plot(df$quality,df$alcohol)
plot(df$fixed.acidity,df$pH)
plot(df$quality,df$total.sulfur.dioxide)
plot(df$chlorides,df$density)
summary() on a prcomp object Summary shows the standard deviation, proportion of variance and cumulative proportion of variance of each principal component
Result: 6 Principal Components explain more than 80% of the variance in the data.
# screeplot shows the variance of each prinicipal component
screeplot(pca_out)
# Alternative:
# It's more usueful to have the proportion (%) of variance of each principal component
# Numeric values about proportion of variance can be retrieved with the summary function
summary(pca_out)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.7667 1.4973 1.2973 1.1023 0.98654 0.81400 0.78633
## Proportion of Variance 0.2601 0.1868 0.1402 0.1013 0.08111 0.05522 0.05153
## Cumulative Proportion 0.2601 0.4469 0.5872 0.6884 0.76952 0.82474 0.87626
## PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.71125 0.64133 0.57264 0.42452 0.24396
## Proportion of Variance 0.04216 0.03428 0.02733 0.01502 0.00496
## Cumulative Proportion 0.91842 0.95270 0.98002 0.99504 1.00000
# Compute the proportion of variance explained by each principal component
# Calculation: variance explained by each principal component / total variance explained by all principal components)
# Calculate variance of prinicipal components
pca_out_var <- pca_out$sdev^2
# calculate proportion of variance for each principal component
pve <- pca_out_var/sum(pca_out_var)
pve
## [1] 0.260097308 0.186823504 0.140243308 0.101251739 0.081105302
## [6] 0.055216020 0.051526483 0.042156046 0.034275628 0.027326616
## [11] 0.015018219 0.004959826
# plots
plot(pve, main="Proporation of Variance per PC",xlab="Principal Component",ylab="Proportion of Variance Explained",ylim=c(0,1),type='b')
plot(cumsum(pve),main="Cumulative Proporation of Variance per PC",xlab="PrincipalComponent",ylab="Cumulative Proportion of Variance
Explained",ylim=c(0,1),type='b')
Purpose: Data visualization of high-dimensional data. How: Mapping from a higher-dimensional input space to a lower-dimensional map space with competitive learning.
# preprocess data for model
df_unique <- unique(df) # remove duplicates, here 200 data rows
glimpse(df_unique)
## Observations: 1,359
## Variables: 12
## $ fixed.acidity <dbl> 7.4, 7.8, 7.8, 11.2, 7.4, 7.9, 7.3, 7.8, ...
## $ volatile.acidity <dbl> 0.700, 0.880, 0.760, 0.280, 0.660, 0.600,...
## $ citric.acid <dbl> 0.00, 0.00, 0.04, 0.56, 0.00, 0.06, 0.00,...
## $ residual.sugar <dbl> 1.9, 2.6, 2.3, 1.9, 1.8, 1.6, 1.2, 2.0, 6...
## $ chlorides <dbl> 0.076, 0.098, 0.092, 0.075, 0.075, 0.069,...
## $ free.sulfur.dioxide <dbl> 11, 25, 15, 17, 13, 15, 15, 9, 17, 15, 16...
## $ total.sulfur.dioxide <dbl> 34, 67, 54, 60, 40, 59, 21, 18, 102, 65, ...
## $ density <dbl> 0.9978, 0.9968, 0.9970, 0.9980, 0.9978, 0...
## $ pH <dbl> 3.51, 3.20, 3.26, 3.16, 3.51, 3.30, 3.39,...
## $ sulphates <dbl> 0.56, 0.68, 0.65, 0.58, 0.56, 0.46, 0.47,...
## $ alcohol <dbl> 9.4, 9.8, 9.8, 9.8, 9.4, 9.4, 10.0, 9.5, ...
## $ quality <int> 5, 5, 5, 6, 5, 5, 7, 7, 5, 5, 5, 5, 5, 5,...
data <-
as.matrix(scale(df_unique[, 1:11])) # scale: mean = 0, sd = 1
# show feature names
dimnames(data)[2] # label successfully removed
## [[1]]
## [1] "fixed.acidity" "volatile.acidity" "citric.acid"
## [4] "residual.sugar" "chlorides" "free.sulfur.dioxide"
## [7] "total.sulfur.dioxide" "density" "pH"
## [10] "sulphates" "alcohol"
# For plotting evaluation against colorcode
# category (~ classification solution)
row_label <- as.factor(rownames(data))
qualities <- as.character(df$quality)
data_train_matrix <- as.matrix(scale(data))
Removed wine quality label and transformed to scaled matrix which serves as input for the SOM model.
General tipps regarding usage of SOMs:
# set up model
set.seed(22) # for reproducible results
# get heuristic number of neurons for the following grid
# use this heuristic from Alexander Maier https://www.researchgate.net/post/How_many_nodes_for_self-organizing_maps
neurons <- 5*sqrt(nrow(df))
neuron_per_grid <- round(sqrt(neurons))
# Define the neuronal grid
som_grid <- somgrid(xdim = neuron_per_grid, ydim = neuron_per_grid,
topo = "hexagonal")
# this grid is not really usable because the interpretation is not really possible, so one would choose a really low level for the neurons
# Redefine the neuronal grid
som_grid <- somgrid(xdim = 4, ydim = 4,
topo = "hexagonal")
# Train the model
som_model <- som(
data_train_matrix,
grid = som_grid,
rlen = 1000, # number of iterations
alpha = c(0.05, 0.01), # learning rate
keep.data = TRUE
)
summary(som_model)
## SOM of size 4x4 with a hexagonal topology and a bubble neighbourhood function.
## The number of data layers is 1.
## Distance measure(s) used: sumofsquares.
## Training data included: 1359 objects.
## Mean distance to the closest unit in the map: 4.076.
### !!! continue here !!! ###
# Check training progress
options(scipen = 999) # for better reading
plot(som_model, type = "changes")
# Check how many samples are mapped to each
# node on the map. (5-10 samples per node)
# Explore training results
plot(som_model, type = "count") # how many datapoints are in a neuron
plot(som_model, type = "mapping", # show datapoints per neuron, 5 - 10 datapoints
col = qualities[row_label]) # per neuron is the target range
# one color per wine quality
plot( # show sample of datapoints, wine quality and number of row
som_model,
type = "mapping",
labels = (rownames(data)),
col = qualities[row_label]
)
Model description: SOM of size 4x4 with a hexagonal topology and a bubble neighbourhood function. The number of data layers is 1. Distance measure(s) used: sumofsquares. Training data included: 1359 objects. Mean distance to the closest unit in the map: 4.076.
The Counts and Mapping plot show two cluster of neurons: One with a lot of data points (n > 75) and the other with few data points (n <= 75)
# U-Matrix: measure of distance between each node and its neighbours.
# (Euclidean distance between weight vectors of neighboring neurons)
# Can be used to identify clusters/boundaries within the SOM map.
# Areas of low neighbour distance ~ groups of nodes that are similar.
plot(som_model, type = "dist.neighbours")
# Codes / Weight vectors: representative of the samples mapped to a node.
# highlights patterns in the distribution of samples and variables.
plot(som_model, type = "codes")
# Heatmaps: identify interesting areas on the map.
# Visualise the distribution of a single variable (defined in [,x]) across the map
plot(som_model,
type = "property",
property = getCodes(som_model, 1)[, 2])
# Same as above but with original, unscaled data (can also be useful)
var_unscaled <- aggregate(
as.numeric(df_unique[, 1]),
by = list(som_model$unit.classif),
FUN = mean,
simplify = TRUE
)[, 2]
plot(som_model, type = "property", property = var_unscaled)
# Clustering: isolate groups of samples with similar metrics
tree <-
as.dendrogram(hclust(dist(as.numeric(
unlist(som_model$codes)
))))
plot(tree, ylab = "Height (h)")
# Cut the tree somewhere based on the above tree
som_cluster <-
cutree(hclust(dist(as.numeric(
unlist(som_model$codes)
))),
h = 2) # k groups or at h hight
# Visualize mapping based on HC
pretty_palette <- c("#1f77b4",
'#ff7f0e',
'#2ca02c',
'#d62728',
'#9467bd',
'#8c564b',
'#e377c2')
plot(
som_model,
type = "mapping",
labels = (rownames(data)),
bgcol = pretty_palette[som_cluster],
col = qualities[row_label]
)
add.cluster.boundaries(som_model, som_cluster)
The Codes plot shows the importance of each feature within a single neuron. The closer a value is to the edge of the circle, the more distinctive is the feature. There seems to be just a few neurons with high feature values (~6). The dendogramm contains too many data points and is useless this way. [continue here with mapping plot]
Tipps vom Dozent: